*************************************************************************************************
/*																								*
	Purpose: Clean raw data							                        
	Creates: NFS70_analysis.dta												
	Note: Universe is ever-married women under 45 years of age.             
*/																								*
*************************************************************************************************

* Start from an empty dataset and turn off output paging so the file runs unattended.
clear 
set more off

* All relative paths below are resolved from the 1970 fertility-study folder.
* The global Mydirectory1 is not defined in this file; it must be set before running it.
cd "$Mydirectory1/1_DataSources/FertilityStudy_1970/"

	* Raw data: download from ICPSR (search for ICPSR 20003). The file name is exactly the same.
	use ./RawData/20003-0001-Data.dta
	* Supplemental syntax from the same ICPSR download. The folder name is spelled
	* exactly as in the use command above so that both paths resolve on
	* case-sensitive file systems.
	quietly run ./RawData/20003-0001-Supplemental_syntax.do

*------------------------------------------------------------------------------*
*------------------------------------------------------------------------------*

* Give the raw variables the working names used in the rest of this file.
* Each old name in the first list is renamed to the name in the same position
* of the second list.

* Region, family income, age bin, birth date, race, childhood residence
rename (INT_LAN_REG EMP_TOTALFAM AGE DOB PER_RACE RES_CHILD) ///
       (region faminc_raw age_bin dob_raw race_raw stateR_childhood)

* Education, employment status, father's occupation, marital status, husband's and own occupation
rename (ED_TOTAL EMP_NOW EMP_FATHOCC MAR_STAT H_EMP_OCC EMP_OCC) ///
       (eduR empstatusR fatherocc maritalstatus occ_sp occR)

* Sibling counts and number of children ever had
rename (PER_OLDB PER_YNGB PER_OLDS PER_YNGS CH_HAVE) ///
       (R_num_olderbrothers R_num_youngerbrothers R_num_oldersisters R_num_youngersisters R_numkids_ever)

*------------------------------------------------------------------------------*
*------------------------------------------------------------------------------*

*********************
*** RESTRICT SAMPLE
*********************

** Foreignborn
	* Flag respondents whose childhood-residence code is 4. The comparison is
	* never missing, so a missing childhood residence is coded 0 here.
	gen foreignborn = stateR_childhood==4
	label var foreignborn "Respondent did not grow up in US"
	
* Keep native-born
	keep if (foreignborn==0 | foreignborn==.) 
	
* Age restriction 
	tab age_bin
	* Stata treats missing as larger than any number, so an unguarded ">=5"
	* would also keep respondents whose age bin is missing. Require a
	* nonmissing age bin explicitly.
	keep if age_bin>=5 & age_bin<.
	
*------------------------------------------------------------------------------*

*********************
*** FAMILY INCOME
*********************

	tab faminc_raw, m

	* Dollar value assigned to each of the 12 codes of faminc_raw, in code order.
	* Code 1 gets 0.75 x 2000 = 1500 and code 12 gets 1.25 x 15000 = 18750;
	* codes 2-11 get the values listed in between.
	local bracket_values 1500 2500 3500 4500 5500 6500 7500 8500 9500 11000 13500 18750

	* Any other code (including missing) leaves fam_inc missing.
	gen fam_inc=.
	forvalues b = 1/12 {
		local dollars : word `b' of `bracket_values'
		replace fam_inc=`dollars' if faminc_raw==`b'
	}
	label var fam_inc "Family income, midpoints"
	
	/*
		Fill in fam_inc from other income variables when it is still missing.
		The code-to-dollar mapping is the same one applied to faminc_raw.
		A value is only ever written where fam_inc is missing, so an earlier
		source always takes precedence over a later one.
	*/
	local bracket_values 1500 2500 3500 4500 5500 6500 7500 8500 9500 11000 13500 18750

	* Married women (maritalstatus==1): income is sometimes missing in
	* faminc_raw but available in EMP_INC70. Only codes 1-6 are recoded
	* for this group.
	count if faminc_raw==. & maritalstatus==1
	tab EMP_INC70 if faminc_raw==. & maritalstatus==1
	forvalues b = 1/6 {
		local dollars : word `b' of `bracket_values'
		replace fam_inc=`dollars' if EMP_INC70==`b' & (fam_inc==. & maritalstatus==1)
	}

	* Women who are not currently married (maritalstatus==2) are all missing
	* faminc_raw. Try EMP_TOTALINC first, then EMP_TOTFAM70, then EMP_INC70,
	* using all 12 codes of each.
	tab EMP_TOTALINC if maritalstatus==2, nol
	foreach source in EMP_TOTALINC EMP_TOTFAM70 EMP_INC70 {
		forvalues b = 1/12 {
			local dollars : word `b' of `bracket_values'
			replace fam_inc=`dollars' if `source'==`b' & maritalstatus==2 & fam_inc==.
		}
	}
	
/*
	Bottom- and top-code flags for family income. The suffix "_son" matches
	the variable names used in other datasets; every respondent (all female)
	with nonmissing fam_inc receives a value.
*/	
	* 1500 is the value given to the lowest income code and 18750 the value
	* given to the highest one.
	gen bottomcoded_son = (fam_inc==1500) if fam_inc<.
	gen topcoded_son = (fam_inc==18750) if fam_inc<.
	label var bottomcoded_son "Respondent family income, bottom coded"
	label var topcoded_son "Respondent family income, top coded"

	/* Convert fam_inc to 1950 dollars with the CPI: https://data.bls.gov/timeseries/CUUR0000SA0 */
	* The two index values are stored as (constant) variables and the ratio is
	* taken from those stored values.
	gen CPI1950 = 24.1
	gen CPI1970 = 38.8
	gen fam_inc_real = fam_inc * (CPI1950/CPI1970)
	label var fam_inc_real "Family income, in 1950 dollars"

	* Natural log of real family income
	gen lnfaminc = ln(fam_inc_real)
	label var lnfaminc "Logged family income"
	
*------------------------------------------------------------------------------*
*------------------------------------------------------------------------------*
	
*********************
*** DEMOGRAPHICS
*********************

* Constants: every respondent is coded female, and the survey year is 1970
	gen female=1
	gen year=1970
	label var female "Respondent is female"
	label var year "Survey year"

** Race
	* Swap the two raw codes: raw 2 becomes 1 and raw 1 becomes 2.
	* Every other raw code (including 3 and missing) is left missing.
	gen race = cond(race_raw==2, 1, cond(race_raw==1, 2, .))
	label var race "Respondent race"

	gen black = (race==2) if race<.
	gen white = (race==1) if race<.

** South
	* Childhood-residence codes 1 and 2 count as South; missing stays missing.
	gen bornsouth = inlist(stateR_childhood, 1, 2) if stateR_childhood<.
	label var bornsouth "R spent childhood in South"

* Region
	* Collapse the nine region codes into four groups; group 3 is the one
	* treated as the South below.
	gen region4=.
	replace region4=1 if inlist(region, 1, 2)
	replace region4=2 if inlist(region, 3, 4)
	replace region4=3 if inlist(region, 5, 6, 7)
	replace region4=4 if inlist(region, 8, 9)

	* Moved in or out of the South. Respondents with childhood code 3 who do
	* not live in region group 3 are left missing, as is everyone else not
	* matched below.
	* NOTE(review): region4!=3 is also true when region4 is missing, so a
	* respondent raised in the South with no region is coded as having left.
	* Confirm whether region can be missing.
	gen moved_inout_south =.
	replace moved_inout_south =0 if bornsouth==1 & region4==3 //stayed in south
	replace moved_inout_south =1 if bornsouth==1 & region4!=3 //left south
	replace moved_inout_south =1 if stateR_childhood==3 & region4==3 //moved into south from outside
	label var moved_inout_south "Moved in or out of South"

** Marital 
	* 1 when maritalstatus is 1, otherwise 0 (a missing status is also coded 0).
	gen married = (maritalstatus==1)
	label var married "Respondent married"
	tab married, m

** Widowed, divorced, separated
	* One 0/1 indicator for each of MAR_CUREND codes 1, 2 and 3, in that order.
	local code = 0
	foreach status in widowed divorced separated {
		local ++code
		gen `status' = (MAR_CUREND==`code')
		tab `status', m
	}

** Never-married--note: study only includes women who have ever been married.
	gen never_married=0
	
* Education for respondent
	tab eduR, m
	* The raw coding scheme is off by 1 for everyone with at least some high
	* school education, so shift every code above 1 up by one.
	replace eduR = eduR + 1 if eduR>1
	tab eduR, m nol

* Re-attach value labels so that they line up with the shifted codes.
	label define ED_TOTAL ///
		1 "1-8, elementary" ///
		3 "1-3, high school" ///
		4 "4, high school " ///
		5 "1-3, college" ///
		6 "4+ college", add modify
	tab eduR,m 
	tab eduR, m nol

* Years of schooling assigned to each education code (other codes stay missing).
	local edu_codes 1 3 4 5 6
	local edu_years 6 10 12 14 16
	gen yrsschool_bin=.
	forvalues j = 1/5 {
		local code : word `j' of `edu_codes'
		local yrs : word `j' of `edu_years'
		replace yrsschool_bin=`yrs' if eduR==`code'
	}
	label var yrsschool_bin "Years of school, binned (missing less than grade 8)"
	tab yrsschool_bin, m

* Attainment dummies: code 4 or above for high school, code 6 or above for
* college. Both are missing when eduR is missing.
	gen hs_ed = (eduR>=4) if eduR<.
	label var hs_ed "HS educated" 
	tab hs_ed, m 

	gen coll_ed = (eduR>=6) if eduR<.
	label var coll_ed "College educated"
	tab coll_ed, m 

* Employment of respondent: 1 when the employment-status code is 1, 0 for any
* other nonmissing code.
	tab empstatusR
	gen employed = (empstatusR==1) if empstatusR<.
	label var employed "Respondent is employed"

************
* Siblings *
************

*# brothers and # sisters
* Total = older + younger when both are reported. When only one of the two is
* reported the total equals the reported one; when neither is, it stays missing.
	foreach s in brothers sisters {
		gen R_num_`s' = R_num_older`s' + R_num_younger`s' if (R_num_older`s'!=. & R_num_younger`s'!=.)
		replace R_num_`s' = R_num_older`s' if (R_num_older`s'!=. & R_num_younger`s'==.)
		replace R_num_`s' = R_num_younger`s' if (R_num_older`s'==. & R_num_younger`s'!=.)
		tab R_num_`s', m  
		label var R_num_`s' "# of R's `s'"
	}

* Flags for a reported count equal to 8, which the labels describe as an
* indeterminate (high) number. The labels previously ended with a stray ")".
	foreach s in brothers sisters {
		gen flag_8plusolder`s' = R_num_older`s'==8 if R_num_older`s'<.
		tab R_num_older`s',m 
		tab flag_8plusolder`s', m
		label var flag_8plusolder`s' "Dummy =1 if R has indeterminate (high) # of older `s'"
		
		gen flag_8plusyounger`s' = R_num_younger`s'==8 if R_num_younger`s'<.
		tab R_num_younger`s', m
		tab flag_8plusyounger`s', m
		label var flag_8plusyounger`s' "Dummy =1 if R has indeterminate (high) # of younger `s'"
	}
	
*# of siblings
* Same rule as above: sum when both totals exist, otherwise whichever exists.
	gen R_num_siblings = R_num_sisters + R_num_brothers if (R_num_sisters!=. & R_num_brothers!=.)
	replace R_num_siblings = R_num_sisters if (R_num_sisters!=. & R_num_brothers==.)
	replace R_num_siblings = R_num_brothers if (R_num_sisters==. & R_num_brothers!=.)
	tab R_num_siblings, m
	label var R_num_siblings "# of R's siblings"

*****************
* Own fertility *
*****************

* Not available in this survey:
*   # (living) boys and the flag for an indeterminate (high) # of boys
*   # (living) girls and the flag for an indeterminate (high) # of girls
*   # of living kids
*   # of deceased kids
* # of kids ever is already coded up (R_numkids_ever).

* Dummy: has R ever had kids
* 1 when the number of kids ever is nonzero, 0 when it is zero, missing when
* the number is missing.
	gen R_kids_ever = cond(R_numkids_ever<., R_numkids_ever!=0, .)
	label var R_kids_ever "Dummy=1 if R has ever had kids"
	tab R_kids_ever,m 

* Dummy: does R have kids right now?--not available

*********************************
* # of persons in R's household *
*********************************

* Not available in this survey:
*   # of kids (of any age) living in R's hh
*   # of children 0-17 living in R's hh
*   total # of people living in R's household

*------------------------------------------------------------------------------*
*------------------------------------------------------------------------------*

*********************
**** HUSBAND'S OCCUPATION
*********************

* The crosswalk is merged on census1970, so the husband's occupation code
* takes that name for the duration of the merge.
	rename occ_sp census1970 

* Crosswalk the 1970 occupations to the ANES occs (the merge adds one variable: fatheroccej).
* Rows that exist only in the crosswalk are not kept. Any respondent left
* unmatched must have a missing occupation code, otherwise the run stops.
	merge m:1 census1970 using ../Crosswalks/Crosswalk_1970Census_toANES.dta, keep(master match)
	assert census1970==. if _merge==1
	drop _merge

* Rename the crosswalked category for the husband and restore the original
* name of the occupation code.
	rename (fatheroccej census1970) (occSPej occ_sp)
	label var occSPej "Occupation of current husband" 
	tab occSPej maritalstatus, m
	
*------------------------------------------------------------------------------*
*------------------------------------------------------------------------------*
	
*********************
**** FATHER'S OCCUPATION
*********************

* The crosswalk is merged on census1970, so the father's occupation code
* takes that name until the end of this section.
	rename fatherocc census1970 

* Crosswalk the 1970 occupations to the ANES occs (the merge adds one variable: fatheroccej).
* Rows that exist only in the crosswalk are not kept. Any respondent left
* unmatched must have a missing occupation code, otherwise the run stops.
	merge m:1 census1970 using ../Crosswalks/Crosswalk_1970Census_toANES.dta, keep(master match)
	assert census1970==. if _merge==1 
	drop _merge

/*
	A dummy for the head of household when R was growing up is unavailable.
	As an alternative, the father is assumed to have been the household head
	during R's childhood whenever a father's occupation is reported. The
	variable is 1 in that case and missing otherwise (it is never 0).
*/
	gen headofhh_father_imputed = cond(fatheroccej!=., 1, .)
	label var headofhh_father_imputed "Impute dad when parent occ!=missing & no info about hh head when R was kid"
	tab headofhh_father_imputed ,m 
	tab fatheroccej, m
 
*****************************************************************************************************
* DUMMIES FOR WHEN WE KNOW WHY DAD OR MOM DIDN'T WORK (I.E. WHY FATHEROCCEJ/MOTHEROCCEJ IS MISSING) *
*****************************************************************************************************
* 0 when a crosswalked father's occupation exists; 1 when it is missing and
* the raw code is 991 (unemployed); missing otherwise.
	gen father_notworking = cond(fatheroccej!=., 0, cond(census1970==991, 1, .))
	tab father_notworking, m
	
// mother_notworking--not available.

* Variable for father being either farm laborer or operator
* 1 for crosswalked categories 71 and 81, 0 for any other reported category,
* missing when the crosswalked occupation is missing.
	gen fatherfarm = cond(fatheroccej==., ., inlist(fatheroccej, 71, 81))
	label var fatherfarm "Father in farm occ."
	
* Restore the original name of the father's occupation code.
	rename census1970 fatherocc
	
*------------------------------------------------------------------------------*
*------------------------------------------------------------------------------*

*********************
*** BIRTH COHORTS 
*********************

*** Fix dob
	tab dob_raw

	* dob_raw is handled as a running count in which code 301 starts the year
	* 1925 and every 12 codes make one year.
	* NOTE(review): presumably a month count with 301 = January 1925 -- confirm
	* against the codebook.
	gen dob = 1925 + floor((dob_raw - 301)/12)
	label var dob "Year of birth"
	
	* 1 when the code falls in the second half of its 12-code year (position 6
	* or later, counting from 0). A missing dob_raw also yields 1 here, but age
	* is missing in that case anyway.
	gen latermonths = mod(dob_raw - 301, 12)>=6
	
	* Codes above 900 are handled as 900 + two-digit year of birth.
	* NOTE(review): for these codes latermonths still comes from the arithmetic
	* above, so whether 1 is subtracted from age below is arbitrary for them.
	* Confirm the intended treatment.
	replace dob= (dob_raw-900)+1900 if dob_raw>900
	
*** Fix age 
	* Age in 1970, minus one for respondents flagged by latermonths.
	gen age = 1970 - dob - latermonths

	* Where the computed age is exactly one year below the value tied to the
	* reported age bin (30, 35, 40 for bins 5, 6, 7), move it up to that value.
	replace age=30 if age_bin==5 & age==29
	replace age=35 if age_bin==6 & age==34
	replace age=40 if age_bin==7 & age==39
	
	label var age "Age of woman"
	tab age, m

	gen agesq = age*age
	label var agesq "Age squared"
	
* Assign everyone the decade in which they were born.
	tab dob, m
	gen decade=.
	replace decade=1920 if dob>=1920 & dob<=1929
	* Every birth year from 1930 on is grouped under 1930. Stata treats missing
	* as larger than any number, so the upper guard is needed to keep a missing
	* dob from being assigned to 1930.
	replace decade=1930 if dob>=1930 & dob<.
	label var decade "Decade of birth"
	
* Generate dummies for each decade
	tab decade, gen(decade_)
	
*------------------------------------------------------------------------------*
*------------------------------------------------------------------------------*

*********************
*** CREATE WEIGHT FOLLOWING CODEBOOK
*********************

* Black women are oversampled--re-weight them according to codebook instructions.
	tab maritalstatus if race_raw==1

	* Everyone whose raw race code is not 1 gets weight 1. Respondents with
	* raw race code 1 get a weight by marital status, and stay missing when
	* marital status is neither 1 nor 2.
	* The variable is referred to by its full name weight_nfs throughout; the
	* abbreviation "weight" stops working when variable abbreviation is turned
	* off or when another variable name starts with "weight".
	gen weight_nfs=1 if race_raw!=1
	replace weight_nfs=0.579 if race_raw==1 & maritalstatus==1 //black women, currently married
	replace weight_nfs=0.432 if race_raw==1 & maritalstatus==2 //black women, post married
	tab weight_nfs race_raw
	
*------------------------------------------------------------------------------*
*------------------------------------------------------------------------------*
	
*******************
**** INTERACTIONS
*******************

* Variables for which a demeaned copy is created.
	global institution_list "black hs_ed coll_ed"

* Demean the variables that we will use.
* For each variable, subtract its unweighted mean over the current sample and
* store the result with the suffix _dm. The new label is the original label
* followed by ", demeaned".
	foreach v of global institution_list {
		sum `v'
		gen `v'_dm = `v'- `r(mean)'
		local lbl : variable label `v'
		label var `v'_dm "`lbl', demeaned" 
	}

*********************
*** SAVE
*********************

	* Unique identifier: the interview number, stored under its analysis name.
	ren INT_NUM id_nfs
	label var id_nfs "Interview number (unique identifier)"

	* Save relevant variables: the identifier, every variable positioned from
	* foreignborn through coll_ed_dm in the dataset, eduR, and all variables
	* whose names start with R_ or flag_.
	keep id_nfs foreignborn-coll_ed_dm eduR R_* flag_* 

	* Report duplicate identifiers (the original author notes that none were reported).
	duplicates report id_nfs

	* Identifier first, rows sorted by identifier, storage types minimized, then save.
	order id_nfs 
	sort id_nfs
	compress
	save ./output/NFS70_analysis.dta, replace
